.libPaths(c("/hpc/packages/minerva-centos7/rpackages/4.2.0/site-library", "/hpc/packages/minerva-centos7/rpackages/bioconductor/3.15", .libPaths()))
library(Seurat)
## Loading required package: SeuratObject
## Loading required package: sp
## 'SeuratObject' was built under R 4.2.0 but the current version is
## 4.3.0; it is recomended that you reinstall 'SeuratObject' as the ABI
## for R may have changed
##
## Attaching package: 'SeuratObject'
## The following object is masked from 'package:base':
##
## intersect
library(patchwork)
library(dplyr)
##
## Attaching package: 'dplyr'
## The following objects are masked from 'package:stats':
##
## filter, lag
## The following objects are masked from 'package:base':
##
## intersect, setdiff, setequal, union
pbmc.data <- Read10X(data.dir = "/sc/arion/projects/ad-omics/ashley/data/aM/outs/per_sample_outs/aM/count/sample_filtered_feature_bc_matrix")
pbmc.m <- CreateSeuratObject(counts = pbmc.data, project = "pbmc_m", min.cells = 3, min.features = 200)
pbmc.m
## An object of class Seurat
## 24658 features across 18951 samples within 1 assay
## Active assay: RNA (24658 features, 0 variable features)
## 1 layer present: counts
Lets check how many cells and features we are starting with.
length(colnames(pbmc.m)) #number of cells
## [1] 18951
length(rownames(pbmc.m)) #number of features
## [1] 24658
a. QC mitochondiral genes. The PercentFeatureSet() calculates the % of counts originating from a set of features - here we are first looking at mitochondiral features, which are genes starting with MT.
b. By using [[ ]], I am adding columns to the pbmc matrix to store this QC data.
pbmc.m[["percent.mt"]] <- PercentageFeatureSet(pbmc.m, pattern = "^MT-")
# Show QC metrics for the first 5 cells
head(pbmc.m@meta.data, 5)
## orig.ident nCount_RNA nFeature_RNA percent.mt
## AAACCTGAGCGATATA-1 pbmc_m 36569 7002 3.040827
## AAACCTGAGCGTAGTG-1 pbmc_m 13800 3631 2.471014
## AAACCTGAGGCAGGTT-1 pbmc_m 6063 2532 3.842982
## AAACCTGAGTCTTGCA-1 pbmc_m 16873 4662 4.065667
## AAACCTGAGTGCCATT-1 pbmc_m 15430 4615 3.000648
library(farver, lib.loc = "/hpc/packages/minerva-centos7/rpackages/4.2.0/site-library")
VlnPlot(pbmc.m, features = c("nFeature_RNA", "nCount_RNA", "percent.mt"), ncol = 3)
plot1 <- FeatureScatter(pbmc.m, feature1 = "nCount_RNA", feature2 = "percent.mt")
plot2 <- FeatureScatter(pbmc.m, feature1 = "nCount_RNA", feature2 = "nFeature_RNA")
plot1
plot2
# I will select for %mt under 10% and features greater than 200 to capture alive healthy cells.
pbmc.m <- subset(pbmc.m, subset = nFeature_RNA > 200 & percent.mt < 10)
head(pbmc.m@meta.data)
## orig.ident nCount_RNA nFeature_RNA percent.mt
## AAACCTGAGCGATATA-1 pbmc_m 36569 7002 3.040827
## AAACCTGAGCGTAGTG-1 pbmc_m 13800 3631 2.471014
## AAACCTGAGGCAGGTT-1 pbmc_m 6063 2532 3.842982
## AAACCTGAGTCTTGCA-1 pbmc_m 16873 4662 4.065667
## AAACCTGAGTGCCATT-1 pbmc_m 15430 4615 3.000648
## AAACCTGCAACACCTA-1 pbmc_m 670 460 3.731343
The data is normalized based on the feature (number of genes in a cell) by the total expression. This number is multiplied by 10,000 and then log transformed. The function to do this is “NormalizeData.” The values specied below are the default values of this function.
pbmc.m <- NormalizeData(pbmc.m, normalization.method = "LogNormalize", scale.factor = 10000)
## Normalizing layer: counts
pbmc.m <- FindVariableFeatures(pbmc.m, selection.method = "vst", nfeatures = 2000)
## Finding variable features for layer counts
# Identify the 10 most highly variable genes
top10 <- head(VariableFeatures(pbmc.m), 10)
# plot variable features with and without labels
plot1 <- VariableFeaturePlot(pbmc.m)
plot2 <- LabelPoints(plot = plot1, points = top10, repel = TRUE, xnudge = 0, ynudge = 0)
plot1
plot2
## Scaling the Data The data is scaled by doing a linear transformation.
The ScaleData function does this by shifting the expression of genes so
that the mean expression becomes 0 and the variance is 1. By default,
only variable genes are scaled. This is changed by features =
all.genes
##Scaling RNA data, we only scale the variable features here for efficiency
all.genes <- rownames(pbmc.m)
pbmc.m <- ScaleData(pbmc.m, vars.to.regress = c("percent.mt"))
## Regressing out percent.mt
## Centering and scaling data matrix
For the first principal components, RunPCA shows the most positive (correlated) and most negative (anticorrelated) genes
pbmc.m <- RunPCA(pbmc.m, features = VariableFeatures(object = pbmc.m))
## PC_ 1
## Positive: IL7R, FP236383.3, TCF7, LTB, AQP3, CD27, PIM2, JAML, CTSS, CCR7
## C1orf162, NCDN, FP671120.4, SAT1, IGFLR1, TIMP1, SNED1, FTL, SESN3, MYC
## CCR4, PASK, LINC00861, AC139720.1, CYSLTR1, BEX3, MAL, FAAH2, ACP5, HERPUD1
## Negative: RRM2, TYMS, MKI67, TUBA1B, STMN1, CDK1, TOP2A, NUSAP1, UBE2C, KIFC1
## PCLAF, ZWINT, ASF1B, TUBB, KNL1, CENPF, ASPM, CCNA2, TPX2, HIST1H1B
## PKMYT1, GTSE1, H2AFX, GGH, SPC25, CDCA2, CDCA8, CKAP2L, HJURP, AURKB
## PC_ 2
## Positive: CTSW, PRF1, KLRD1, NKG7, GZMB, TYROBP, CCL4, CCL3, KLRC1, TRDC
## FCER1G, PLEK, CST7, IL2RB, AOAH, CCL5, FCGR3A, NCAM1, GNLY, KIR2DL4
## GZMA, SRGN, EOMES, TOX, KLRK1, MATK, CD300A, FGR, GSTP1, GZMK
## Negative: IL7R, TIMP1, AQP3, S100A6, ANP32B, SNED1, COTL1, RGCC, MYC, IL9R
## STAT1, TCF7, PDE3B, SELL, MAL, WDR86-AS1, SOS1, HSF4, INPP4B, LMNA
## NME4, FHIT, BEX3, CCR4, CRIP1, KLF3, PRDX1, IMPDH2, LTB, HMGA1
## PC_ 3
## Positive: PLK1, CCNB1, KIF20A, DLGAP5, CENPA, PSRC1, ASPM, CDC20, CCNB2, HMMR
## KIF14, NEK2, CENPF, TROAP, ARL6IP1, CENPE, PIMREG, UBE2C, PIF1, CDCA8
## TOP2A, AURKA, CDCA3, KIF2C, KNSTRN, TPX2, KPNA2, BIRC5, KIF23, GTSE1
## Negative: GINS2, MCM2, MCM6, MCM5, CDC6, MCM7, MCM3, MCM4, PAICS, UNG
## CDC45, HSP90AB1, E2F1, DCTPP1, SLBP, CDCA7, CHAF1B, NME1, DTL, PPP1R14B
## FEN1, HELLS, CCNE2, CDK4, MIF, MSH6, SRM, DUT, C1QBP, POLD2
## PC_ 4
## Positive: FCER1G, CCR7, TCF7, TXK, DHRS3, NCAM1, PTGDR, BACH2, SH2D1B, GSTM2
## SYK, KIR2DL4, CXXC5, KLRF1, ID3, RNF130, XCL2, NDFIP2, CYSLTR1, CTBP2
## LDB2, RAMP1, XCL1, TOX2, TYROBP, AFAP1L2, NCR1, ACTN1, ADGRG3, PTMA
## Negative: S100A6, HLA-DPA1, HLA-DRB1, LAG3, CD74, LYAR, LGALS1, HLA-DQA1, HLA-DRA, GZMH
## HLA-DPB1, FGFBP2, TRDV2, LINC02694, HPGD, THEMIS, FLNA, HOPX, AHNAK, MSC
## TRGV9, ANXA2, CCL5, NKG7, CST7, ALOX5AP, HLA-DQB1, CXCR3, LGALS3, LTB
## PC_ 5
## Positive: IL9R, LGALS1, TNFRSF18, IL2RA, TNFRSF4, TIMP1, CAPG, SOS1, TNFSF10, ANXA2
## LMNA, IL17RB, GATA3, TXN, GAB2, TPM4, NDFIP2, LAT2, CSF2, ACTG1
## CD82, PRDX1, HSPA5, AHI1, IL3RA, SNED1, RBPJ, CCNB1, NQO1, TBXAS1
## Negative: GZMH, LINC00861, FGFBP2, GZMK, TRDV2, CCR7, C1orf21, TRGV9, KLRC4, TCF7
## PLEK, FAM111B, LYAR, CX3CR1, LINC02446, PCNA, LAG3, CXCR4, PRSS23, MYBL2
## CCL5, MCM7, CTNNAL1, PKMYT1, CLSPN, LIG1, TK1, CDT1, DUT, DTL
print(pbmc.m[["pca"]], dims = 1:5, nfeatures = 5)
## PC_ 1
## Positive: IL7R, FP236383.3, TCF7, LTB, AQP3
## Negative: RRM2, TYMS, MKI67, TUBA1B, STMN1
## PC_ 2
## Positive: CTSW, PRF1, KLRD1, NKG7, GZMB
## Negative: IL7R, TIMP1, AQP3, S100A6, ANP32B
## PC_ 3
## Positive: PLK1, CCNB1, KIF20A, DLGAP5, CENPA
## Negative: GINS2, MCM2, MCM6, MCM5, CDC6
## PC_ 4
## Positive: FCER1G, CCR7, TCF7, TXK, DHRS3
## Negative: S100A6, HLA-DPA1, HLA-DRB1, LAG3, CD74
## PC_ 5
## Positive: IL9R, LGALS1, TNFRSF18, IL2RA, TNFRSF4
## Negative: GZMH, LINC00861, FGFBP2, GZMK, TRDV2
The PCA results can be visualized in different ways.
VizDimLoadings(pbmc.m, dims = 1:2, reduction = "pca")
DimPlot(pbmc.m, reduction = "pca") + NoLegend()
DimHeatmap(pbmc.m, dims = 1, cells = 500, balanced = TRUE)
DimHeatmap(pbmc.m, dims = 1:15, cells = 500, balanced = TRUE)
Cells will be clustered based on PCA. How many PC to use is dependent on many factors. For example, if trying to analyze a rare cell subset, you might want to add more PCs. Usually, the first 10 is good to see dimensionality of the data.
ElbowPlot(pbmc.m)
##We select the top 15 PCs for clustering and tSNE based on PCElbowPlot
pbmc.m <- FindNeighbors(pbmc.m, reduction = "pca", dims = 1:15)
## Computing nearest neighbor graph
## Computing SNN
pbmc.m <- FindClusters(pbmc.m, resolution = 0.5, verbose = FALSE)
head(Idents(pbmc.m), 5)
## AAACCTGAGCGATATA-1 AAACCTGAGCGTAGTG-1 AAACCTGAGGCAGGTT-1 AAACCTGAGTCTTGCA-1
## 2 2 1 2
## AAACCTGAGTGCCATT-1
## 9
## Levels: 0 1 2 3 4 5 6 7 8 9 10
pbmc.m <- RunUMAP(pbmc.m, dims = 1:15)
## 10:56:03 UMAP embedding parameters a = 0.9922 b = 1.112
## 10:56:03 Read 18706 rows and found 15 numeric columns
## 10:56:03 Using Annoy for neighbor search, n_neighbors = 30
## 10:56:03 Building Annoy index with metric = cosine, n_trees = 50
## 0% 10 20 30 40 50 60 70 80 90 100%
## [----|----|----|----|----|----|----|----|----|----|
## **************************************************|
## 10:56:05 Writing NN index file to temp file /tmp/RtmpLV7FNl/filedf1f675edde4
## 10:56:05 Searching Annoy index using 1 thread, search_k = 3000
## 10:56:11 Annoy recall = 100%
## 10:56:12 Commencing smooth kNN distance calibration using 1 thread with target n_neighbors = 30
## 10:56:13 Initializing from normalized Laplacian + noise (using RSpectra)
## 10:56:13 Commencing optimization for 200 epochs, with 794186 positive edges
## 10:56:36 Optimization finished
# note that you can set `label = TRUE` or use the LabelClusters function to help label
# individual clusters
DimPlot(pbmc.m, reduction = "umap", label = TRUE)
pbmc.m <- RunTSNE(pbmc.m, reduction = "pca", dims = 1:15)
DimPlot(pbmc.m, reduction = "tsne", label = TRUE)
Lets check our metadata now of the seurat object to see what has been added.
head(pbmc.m@meta.data)
## orig.ident nCount_RNA nFeature_RNA percent.mt
## AAACCTGAGCGATATA-1 pbmc_m 36569 7002 3.040827
## AAACCTGAGCGTAGTG-1 pbmc_m 13800 3631 2.471014
## AAACCTGAGGCAGGTT-1 pbmc_m 6063 2532 3.842982
## AAACCTGAGTCTTGCA-1 pbmc_m 16873 4662 4.065667
## AAACCTGAGTGCCATT-1 pbmc_m 15430 4615 3.000648
## AAACCTGCAACACCTA-1 pbmc_m 670 460 3.731343
## RNA_snn_res.0.5 seurat_clusters
## AAACCTGAGCGATATA-1 2 2
## AAACCTGAGCGTAGTG-1 2 2
## AAACCTGAGGCAGGTT-1 1 1
## AAACCTGAGTCTTGCA-1 2 2
## AAACCTGAGTGCCATT-1 9 9
## AAACCTGCAACACCTA-1 0 0
#We now have seurat clusters and RNA_snn_res.0.5 columns added.
# find markers for every cluster compared to all remaining cells, report only the positive
# ones
markers_m <- FindAllMarkers(pbmc.m, only.pos = TRUE)
## Calculating cluster 0
## Calculating cluster 1
## Calculating cluster 2
## Calculating cluster 3
## Calculating cluster 4
## Calculating cluster 5
## Calculating cluster 6
## Calculating cluster 7
## Calculating cluster 8
## Calculating cluster 9
## Calculating cluster 10
VlnPlot will display the differential expression across the clusters. For example, I am looking here at CD8A and CD4 expression in the clusters.
VlnPlot(pbmc.m, features = c("CD8A", "CD4"))
The raw counts can also be shown instead by adding some parameters.
VlnPlot(pbmc.m, features = c("CD8A", "CD4"), slot = "counts", log = TRUE)
FeaturePlot(pbmc.m, features = c("MS4A1", "GNLY", "CD3E", "CD14", "FCER1A", "FCGR3A", "LYZ", "PPBP",
"CD8A"))
.libPaths(c("/hpc/packages/minerva-centos7/rpackages/4.2.0/site-library", "/hpc/packages/minerva-centos7/rpackages/bioconductor/3.15", .libPaths()))
library(patchwork)
library(tidyr)
m_demuxlet = read.delim("/sc/arion/projects/ad-omics/ashley/data/aM.best", header = T, stringsAsFactors = F, check.names = F)
head(m_demuxlet)
## BARCODE RD.TOTL RD.PASS RD.UNIQ N.SNP
## 1 AAACCTGAGCGATATA-1 108714 15696 12405 5336
## 2 AAACCTGAGCGTAGTG-1 32770 4639 4030 1659
## 3 AAACCTGAGGCAGGTT-1 17692 2237 1768 1172
## 4 AAACCTGAGTCTTGCA-1 50476 6748 5402 2761
## 5 AAACCTGAGTGCCATT-1 44258 6139 4879 2498
## 6 AAACCTGCAACACCTA-1 2652 251 207 163
## BEST SNG.1ST
## 1 DBL-GSA8_0_NYUMD0327-01-GSA8_0_NYUMD0334-01-0.500 GSA8_0_NYUMD0334-01
## 2 SNG-GSA8_0_NYUMD0315-01 GSA8_0_NYUMD0315-01
## 3 SNG-GSA8_0_NYUMD0327-01 GSA8_0_NYUMD0327-01
## 4 SNG-GSA8_0_NYUMD0315-01 GSA8_0_NYUMD0315-01
## 5 SNG-GSA8_0_NYUMD0315-01 GSA8_0_NYUMD0315-01
## 6 SNG-GSA8_0_NYUMD0327-01 GSA8_0_NYUMD0327-01
## SNG.LLK1 SNG.2ND SNG.LLK2 SNG.LLK0 DBL.1ST
## 1 -3624.4315 GSA8_0_NYUMD0327-01 -4035.0095 -3409.1657 GSA8_0_NYUMD0327-01
## 2 -592.3739 GSA8_0_NYUMD0289-01 -1650.8367 -986.3261 GSA8_0_NYUMD0289-01
## 3 -419.2268 GSA8_0_NYUMD0289-01 -965.4518 -628.2189 GSA8_0_NYUMD0315-01
## 4 -925.1157 GSA8_0_NYUMD0289-01 -2551.6585 -1551.8233 GSA8_0_NYUMD0315-01
## 5 -860.0781 GSA8_0_NYUMD0334-01 -2364.5970 -1443.7875 GSA8_0_NYUMD0315-01
## 6 -55.5857 GSA8_0_NYUMD0315-01 -164.0652 -104.5352 GSA8_0_NYUMD0315-01
## DBL.2ND ALPHA LLK12 LLK1 LLK2 LLK10
## 1 GSA8_0_NYUMD0334-01 0.5 -2581.7203 -4035.0095 -3624.4315 -3643.5020
## 2 GSA8_0_NYUMD0315-01 0.5 -933.2834 -1650.8367 -592.3739 -1677.4911
## 3 GSA8_0_NYUMD0327-01 0.5 -564.1035 -994.3223 -419.2268 -840.8888
## 4 GSA8_0_NYUMD0327-01 0.5 -1447.4708 -925.1157 -2570.0140 -1455.0858
## 5 GSA8_0_NYUMD0334-01 0.5 -1359.2728 -860.0781 -2364.5970 -1390.1505
## 6 GSA8_0_NYUMD0327-01 0.5 -82.1285 -164.0652 -55.5857 -141.7246
## LLK20 LLK00 PRB.DBL PRB.SNG1
## 1 -3476.9482 -3089.7874 1.00e+00 NaN
## 2 -933.2834 -1028.3060 2.94e-149 1
## 3 -565.5310 -627.3371 4.98e-64 1
## 4 -2278.0753 -1610.0639 4.65e-228 1
## 5 -2162.2396 -1504.2012 5.32e-218 1
## 6 -86.4408 -103.7608 1.00e-12 1
To edit: I will split the Best column into multiple columns.
m_demuxlet_edit = m_demuxlet %>%
mutate(BEST = gsub("-01","", BEST)) %>%
separate(BEST, into=c("DMX_classification.global","DMX_maxID","DMX_secondID"), sep="-") %>%
separate(DMX_maxID, into=c("DMX_garbage1","DMX_garbage2","DMX_maxID"), sep ="_") %>%
separate(DMX_secondID, into=c("DMX_garbage3","DMX_garbage4","DMX_secondID"), sep ="_") %>%
select(-contains("garbage"))
## Warning: Expected 3 pieces. Additional pieces discarded in 2409 rows [1, 8, 13,
## 16, 17, 20, 43, 48, 63, 69, 73, 81, 95, 112, 114, 129, 136, 141, 172, 202,
## ...].
## Warning: Expected 3 pieces. Missing pieces filled with `NA` in 16725 rows [2,
## 3, 4, 5, 6, 7, 9, 10, 11, 12, 14, 15, 18, 19, 21, 22, 23, 24, 25, 26, ...].
head(m_demuxlet_edit)
## BARCODE RD.TOTL RD.PASS RD.UNIQ N.SNP DMX_classification.global
## 1 AAACCTGAGCGATATA-1 108714 15696 12405 5336 DBL
## 2 AAACCTGAGCGTAGTG-1 32770 4639 4030 1659 SNG
## 3 AAACCTGAGGCAGGTT-1 17692 2237 1768 1172 SNG
## 4 AAACCTGAGTCTTGCA-1 50476 6748 5402 2761 SNG
## 5 AAACCTGAGTGCCATT-1 44258 6139 4879 2498 SNG
## 6 AAACCTGCAACACCTA-1 2652 251 207 163 SNG
## DMX_maxID DMX_secondID SNG.1ST SNG.LLK1 SNG.2ND
## 1 NYUMD0327 NYUMD0334 GSA8_0_NYUMD0334-01 -3624.4315 GSA8_0_NYUMD0327-01
## 2 NYUMD0315 <NA> GSA8_0_NYUMD0315-01 -592.3739 GSA8_0_NYUMD0289-01
## 3 NYUMD0327 <NA> GSA8_0_NYUMD0327-01 -419.2268 GSA8_0_NYUMD0289-01
## 4 NYUMD0315 <NA> GSA8_0_NYUMD0315-01 -925.1157 GSA8_0_NYUMD0289-01
## 5 NYUMD0315 <NA> GSA8_0_NYUMD0315-01 -860.0781 GSA8_0_NYUMD0334-01
## 6 NYUMD0327 <NA> GSA8_0_NYUMD0327-01 -55.5857 GSA8_0_NYUMD0315-01
## SNG.LLK2 SNG.LLK0 DBL.1ST DBL.2ND ALPHA
## 1 -4035.0095 -3409.1657 GSA8_0_NYUMD0327-01 GSA8_0_NYUMD0334-01 0.5
## 2 -1650.8367 -986.3261 GSA8_0_NYUMD0289-01 GSA8_0_NYUMD0315-01 0.5
## 3 -965.4518 -628.2189 GSA8_0_NYUMD0315-01 GSA8_0_NYUMD0327-01 0.5
## 4 -2551.6585 -1551.8233 GSA8_0_NYUMD0315-01 GSA8_0_NYUMD0327-01 0.5
## 5 -2364.5970 -1443.7875 GSA8_0_NYUMD0315-01 GSA8_0_NYUMD0334-01 0.5
## 6 -164.0652 -104.5352 GSA8_0_NYUMD0315-01 GSA8_0_NYUMD0327-01 0.5
## LLK12 LLK1 LLK2 LLK10 LLK20 LLK00 PRB.DBL
## 1 -2581.7203 -4035.0095 -3624.4315 -3643.5020 -3476.9482 -3089.7874 1.00e+00
## 2 -933.2834 -1650.8367 -592.3739 -1677.4911 -933.2834 -1028.3060 2.94e-149
## 3 -564.1035 -994.3223 -419.2268 -840.8888 -565.5310 -627.3371 4.98e-64
## 4 -1447.4708 -925.1157 -2570.0140 -1455.0858 -2278.0753 -1610.0639 4.65e-228
## 5 -1359.2728 -860.0781 -2364.5970 -1390.1505 -2162.2396 -1504.2012 5.32e-218
## 6 -82.1285 -164.0652 -55.5857 -141.7246 -86.4408 -103.7608 1.00e-12
## PRB.SNG1
## 1 NaN
## 2 1
## 3 1
## 4 1
## 5 1
## 6 1
table(m_demuxlet_edit$DMX_classification.global) #num of singlets and doublets
##
## AMB DBL SNG
## 10 2399 16725
table(m_demuxlet_edit$DMX_maxID) #number of cells identified as each donor
##
## NYUMD0289 NYUMD0315 NYUMD0327 NYUMD0334
## 4972 4523 6209 3430
table(m_demuxlet_edit[,c("DMX_classification.global","DMX_maxID")]) #number of singlets or doublets identified as each donor
## DMX_maxID
## DMX_classification.global NYUMD0289 NYUMD0315 NYUMD0327 NYUMD0334
## AMB 2 2 3 3
## DBL 1134 787 458 20
## SNG 3836 3734 5748 3407
m_demuxlet_edit.subset <- m_demuxlet_edit[m_demuxlet_edit$BARCODE %in% colnames(pbmc.m),]
pbmc.m@meta.data <- cbind(pbmc.m@meta.data,m_demuxlet_edit.subset$DMX_maxID,m_demuxlet_edit.subset$DMX_classification.global, m_demuxlet_edit.subset$BARCODE)
head(pbmc.m@meta.data)
## orig.ident nCount_RNA nFeature_RNA percent.mt
## AAACCTGAGCGATATA-1 pbmc_m 36569 7002 3.040827
## AAACCTGAGCGTAGTG-1 pbmc_m 13800 3631 2.471014
## AAACCTGAGGCAGGTT-1 pbmc_m 6063 2532 3.842982
## AAACCTGAGTCTTGCA-1 pbmc_m 16873 4662 4.065667
## AAACCTGAGTGCCATT-1 pbmc_m 15430 4615 3.000648
## AAACCTGCAACACCTA-1 pbmc_m 670 460 3.731343
## RNA_snn_res.0.5 seurat_clusters
## AAACCTGAGCGATATA-1 2 2
## AAACCTGAGCGTAGTG-1 2 2
## AAACCTGAGGCAGGTT-1 1 1
## AAACCTGAGTCTTGCA-1 2 2
## AAACCTGAGTGCCATT-1 9 9
## AAACCTGCAACACCTA-1 0 0
## m_demuxlet_edit.subset$DMX_maxID
## AAACCTGAGCGATATA-1 NYUMD0327
## AAACCTGAGCGTAGTG-1 NYUMD0315
## AAACCTGAGGCAGGTT-1 NYUMD0327
## AAACCTGAGTCTTGCA-1 NYUMD0315
## AAACCTGAGTGCCATT-1 NYUMD0315
## AAACCTGCAACACCTA-1 NYUMD0327
## m_demuxlet_edit.subset$DMX_classification.global
## AAACCTGAGCGATATA-1 DBL
## AAACCTGAGCGTAGTG-1 SNG
## AAACCTGAGGCAGGTT-1 SNG
## AAACCTGAGTCTTGCA-1 SNG
## AAACCTGAGTGCCATT-1 SNG
## AAACCTGCAACACCTA-1 SNG
## m_demuxlet_edit.subset$BARCODE
## AAACCTGAGCGATATA-1 AAACCTGAGCGATATA-1
## AAACCTGAGCGTAGTG-1 AAACCTGAGCGTAGTG-1
## AAACCTGAGGCAGGTT-1 AAACCTGAGGCAGGTT-1
## AAACCTGAGTCTTGCA-1 AAACCTGAGTCTTGCA-1
## AAACCTGAGTGCCATT-1 AAACCTGAGTGCCATT-1
## AAACCTGCAACACCTA-1 AAACCTGCAACACCTA-1
The Seurat Object has already been preprocessed in my case, so this should be clean)
pbmc.m[["percent.mt"]] <- PercentageFeatureSet(pbmc.m, pattern = "^MT-")
library(cowplot)
##
## Attaching package: 'cowplot'
## The following object is masked from 'package:patchwork':
##
## align_plots
# look at distribution of metrics by classification
plot_grid(VlnPlot(pbmc.m, features = c("nFeature_RNA", "nCount_RNA", "percent.mt"), ncol = 3, group.by = "m_demuxlet_edit.subset$DMX_maxID"))
VlnPlot(pbmc.m, features = c("nFeature_RNA", "nCount_RNA", "percent.mt"), ncol = 3, group.by = "m_demuxlet_edit.subset$DMX_classification.global")
Now, select for MT content under 10% and for nfeatureRNA > 200 to ensure getting alive cells. I previously did this, so should see no change in cell numnbers.
pbmc.m <- subset(pbmc.m, subset = nFeature_RNA > 200 & percent.mt < 10)
length(colnames(pbmc.m))
## [1] 18706
length(rownames(pbmc.m))
## [1] 24658
Add column to meta data to identify seurat object as Basline condition. Also rename some columns for clarity purposes.
pbmc.m@meta.data$condition <- 'Monomer'
names(pbmc.m@meta.data)[names(pbmc.m@meta.data) == "m_demuxlet_edit.subset$DMX_classification.global"] <- "DMX_classification.global"
names(pbmc.m@meta.data)[names(pbmc.m@meta.data) == "m_demuxlet_edit.subset$DMX_maxID"] <- "DMX_maxID"
names(pbmc.m@meta.data)[names(pbmc.m@meta.data) == "m_demuxlet_edit.subset$BARCODE"] <- "Barcode"
head(pbmc.m@meta.data)
## orig.ident nCount_RNA nFeature_RNA percent.mt
## AAACCTGAGCGATATA-1 pbmc_m 36569 7002 3.040827
## AAACCTGAGCGTAGTG-1 pbmc_m 13800 3631 2.471014
## AAACCTGAGGCAGGTT-1 pbmc_m 6063 2532 3.842982
## AAACCTGAGTCTTGCA-1 pbmc_m 16873 4662 4.065667
## AAACCTGAGTGCCATT-1 pbmc_m 15430 4615 3.000648
## AAACCTGCAACACCTA-1 pbmc_m 670 460 3.731343
## RNA_snn_res.0.5 seurat_clusters DMX_maxID
## AAACCTGAGCGATATA-1 2 2 NYUMD0327
## AAACCTGAGCGTAGTG-1 2 2 NYUMD0315
## AAACCTGAGGCAGGTT-1 1 1 NYUMD0327
## AAACCTGAGTCTTGCA-1 2 2 NYUMD0315
## AAACCTGAGTGCCATT-1 9 9 NYUMD0315
## AAACCTGCAACACCTA-1 0 0 NYUMD0327
## DMX_classification.global Barcode condition
## AAACCTGAGCGATATA-1 DBL AAACCTGAGCGATATA-1 Monomer
## AAACCTGAGCGTAGTG-1 SNG AAACCTGAGCGTAGTG-1 Monomer
## AAACCTGAGGCAGGTT-1 SNG AAACCTGAGGCAGGTT-1 Monomer
## AAACCTGAGTCTTGCA-1 SNG AAACCTGAGTCTTGCA-1 Monomer
## AAACCTGAGTGCCATT-1 SNG AAACCTGAGTGCCATT-1 Monomer
## AAACCTGCAACACCTA-1 SNG AAACCTGCAACACCTA-1 Monomer
saveRDS(pbmc.m, file = "/sc/arion/projects/ad-omics/ashley/PD_Stim/pbmc.monomer.final.RDS")